phonosaurus('the phonological thesaurus')

In some of my other posts I created a function that computes the phonological similarity between words using the package FuzzyWuzzy in combination with a function I made myself which generates phonological feature tiers for words in Indonesian. The phonological similarity function can be tweaked a bit to create a sound similarity dictionary which is passed a string and returns the word or words in the language which are most similar. This sort of 'sound thesaurus' is fun to play with and could have numerous applications (beyond the finding placename etymology, the purpose for which I originally created it.) The function looks for the most similar word in Indonesian, but it could easily be adapted for English.



In [1]:

    
#I won't discuss the code below at length because I have discussed it in previous posts



In [2]:

    
import pandas as pd
import numpy as np
# importing dictionary
kbbi = pd.read_csv("/Users/admin/Desktop/loanwords/clean.kbbi.csv")
kbbi.columns = ['old_index', 'words']
kbbi['words'] = kbbi['words'].apply(lambda x: str(x))

Generating a matrix of phonological features:



In [3]:

    
def phono_matrix(string):
    string_of_matrixes = []
    for character in string:
        matrix = {}
###populate manner: 
    ### sonorant
        if character in ['a','e','i','o','u','y','w','m','N','Y','l','r','h','q']:
            matrix['sonorant'] = 'Y'
        else:
            matrix['sonorant'] = 'N'
    
    ###continuant
        if character in ['l','r','y','w','a','e','i','o','u','s','z','f']:
            matrix['continuant'] = 'Y'
        else:
            matrix['continuant'] = 'N'
    
    ###consonant
        if character in ['p','t','k','q','h','c','b','d','g','j','s','z','f','m','n','Y','N','l','r']:
            matrix['consonant'] = 'Y'
        else:
            matrix['consonant'] = 'N'
    
    ###syllabic
        if character in ['a','e','i','o','u']:
            matrix['syllabic'] = 'Y'
        else:
            matrix['syllabic'] = 'N'

    ###strident
        if character in ['s','j','c']:
            matrix['strident'] = 'Y'
        else:
            matrix['strident'] = 'N'
    
###populate place: labial, coronal, palatal, velar, glottal 
    ###labial
        if character in ['p','m','f','b','w','u','o']:
            matrix['labial'] = 'Y'
        else:
            matrix['labial'] = 'N'   

    ###coronal
        if character in ['t','d','n','s','j','c','Y','i','e','r','l']:
            matrix['coronal'] = 'Y'
        else:
            matrix['coronal'] = 'N' 
 
    ###palatal
        if character in ['s','j','c','i','Y','e']:
            matrix['palatal'] = 'Y'
        else:
            matrix['palatal'] = 'N'
    ###palatal
        if character in ['u','k','g','N','o']:
            matrix['velar'] = 'Y'
        else:
            matrix['velar'] = 'N'
    ###glottal
        if character in ['h','q']:
            matrix['glottal'] = 'Y'
        else:
            matrix['glottal'] = 'N'

    ###glottal
        if character in ['h','q']:
            matrix['glottal'] = 'Y'
        else:
            matrix['glottal'] = 'N'
###nasality
    ###nasal/oral
        if character in ['m','n','Y','N']:
            matrix['nasal'] = 'Y'
        else:
            matrix['nasal'] = 'N'
            
###populate obstruent voicing 
###i assume that [voice] is only phonologically active in obstruents
    ###voiced/voiceless obstruent
        if character in ['b','d','g','j']:
            matrix['voice'] = 'Y'
        else:
            matrix['voice'] = 'N'          
            
### populate lateral/rhotic
    ###lateral
        if character == 'l':
            matrix['lateral'] = 'Y'
        else:
            matrix['lateral'] = 'N'
    ###rhotic
        if character == 'r':
            matrix['rhotic'] = 'Y'
        else:
            matrix['rhotic'] = 'N'

###populate vowel height
###I assume at mid is not an active feature
    ### high
        if character in ['i','u']:
            matrix['high'] = 'Y'
        else:
            matrix['high'] = 'N'
    ### low
        if character == 'a':
            matrix['low'] = 'Y'
        else:
            matrix['low'] = 'N'
        
        string_of_matrixes.append(matrix)
    return(string_of_matrixes)



In [4]:

    
# let's build phonological matrixes
kbbi['matrixes'] = kbbi.words.apply(lambda x: phono_matrix(x))



In [5]:

    
def tier_builder(string_of_matrixes):
    features = ['sonorant','consonant','continuant','syllabic','strident','labial','coronal','palatal','velar', 'glottal','nasal','voice','lateral','rhotic']
    tier_dictionary = {}
    for feature in features:
        tier_dictionary[feature] = str()
        for matrix in string_of_matrixes:
            if matrix[feature] is not None:
                tier_dictionary[feature] = tier_dictionary[feature] + matrix[feature]
    return(tier_dictionary)



In [6]:

    
kbbi['tiers'] = kbbi.matrixes.apply(lambda x: tier_builder(x))



In [7]:

    
from fuzzywuzzy import fuzz, StringMatcher
import difflib

def similarity(word1_tiers,word2_tiers):
    features = ['sonorant','continuant','syllabic','strident','labial','coronal','palatal','velar', 'glottal','nasal','voice','lateral','rhotic']  
    tier_similarity = {}
    for feature in features:
        tier_similarity[feature] = fuzz.ratio(word1_tiers[feature],word2_tiers[feature])
    tier_similarity = pd.Series(tier_similarity)
    return(tier_similarity.mean())



In [8]:

    
def phonosaurus(word,threshold=0): # a higher threshold means you permit more words
    matrix = phono_matrix(word)
    tier = tier_builder(matrix)
    indexes = []
    top_score = 0
    best_matches = []
    for i in range(0,len(kbbi['tiers'])):
        tier1 = kbbi['tiers'][i]
        ratio = similarity(tier1,tier)
        if ratio > top_score and ratio < 100.0:
            top_score = ratio
            indexes = []
            indexes.append(kbbi['words'][i])
        elif ratio > top_score-threshold and ratio < top_score:
            indexes.append(kbbi['words'][i])
        else:
            pass
    return "Most similar: " + ', '.join(indexes),"Top score: " + str(top_score), "Threshold score: " + str(top_score-threshold)



In [ ]:



In [ ]:



In [ ]:



In [16]:

    
phonosaurus('ajon')









    Out[16]:





('Most similar: ijon',
 'Top score: 96.1538461538',
 'Threshold score: 96.1538461538')



In [17]:

    
phonosaurus('ajon',2) # increasing the threshold allows a few more near matches









    Out[17]:





('Most similar: ijon, acan',
 'Top score: 96.1538461538',
 'Threshold score: 94.1538461538')



In [11]:

    
phonosaurus('dodol',5)









    Out[11]:





('Most similar: judul, godot, dabol, modul, bogol, bedol, pudur, dabal, kibul, kedul, kobol, tolol, kebul, tukul, duduq, dudus, botol, botor, tuyul, tugal, tutut, tutur, bakul, kubul, kabul, potol, jojol, budur, babal, dadal, getol, katul, badal, bagal, bebal, dagel, dobel, total, togel, bobol, gubal, dubur, tugur, dugal, bodor, nuzul, tagal, kidul, donor, tukal, tuduh, bubul, gajul, Digul, tujul, begal, takol, dodot, dodos, dodoq, betul, bagul, butul, kojol, bajul, badur, nonol, tutor, gagal, dogel, dulur',
 'Top score: 96.9230769231',
 'Threshold score: 91.9230769231')



In [ ]: